A COMPARATIVE STUDY OF MACHINE LEARNING(ML) AND DEEP NEURAL NETWORK(DNN) PERFORMANCES FOR HOUSING PRICE PREDICTION


ML vs. DNN in Housing Price Prediction


Introduction paragraph will be included which will briefly introduce what this project is about. What datasets we used? What machine learning algorithms we utilized. What deep learning algorithm I have used and how I try to improve its performance using hyperparameter tuning. Brief description of study results.


Setting Up a Working Environment

#Bringing necessary library
library(dplyr) 
library(ggplot2)
library(rsample)
library(caret)
library(h2o)
library(visdat)
library(recipes)
library(modeldata)
library(ggplot2)
library(plyr)
library(PupillometryR)
library(leaps)   
library(MASS)
library(car)
library(glmnet)
library(vip)
library(pls)
library(keras)
library(tfruns)
library(tfestimators)
library(tensorflow)
library(reticulate)
library(remotes)
library(recipes)
library(randomForest)
library(tidyverse)
library(ggmap)
library(sf)
library(RColorBrewer)
library(dplyr)
library(DiagrammeR)

Dataset Description

Paragraph about dataset description - Ames Housing Dataset and Boston Housing Dataset

#Brining Ames Housing dataset
ames <- AmesHousing::make_ames()
names(ames)
##  [1] "MS_SubClass"        "MS_Zoning"          "Lot_Frontage"      
##  [4] "Lot_Area"           "Street"             "Alley"             
##  [7] "Lot_Shape"          "Land_Contour"       "Utilities"         
## [10] "Lot_Config"         "Land_Slope"         "Neighborhood"      
## [13] "Condition_1"        "Condition_2"        "Bldg_Type"         
## [16] "House_Style"        "Overall_Qual"       "Overall_Cond"      
## [19] "Year_Built"         "Year_Remod_Add"     "Roof_Style"        
## [22] "Roof_Matl"          "Exterior_1st"       "Exterior_2nd"      
## [25] "Mas_Vnr_Type"       "Mas_Vnr_Area"       "Exter_Qual"        
## [28] "Exter_Cond"         "Foundation"         "Bsmt_Qual"         
## [31] "Bsmt_Cond"          "Bsmt_Exposure"      "BsmtFin_Type_1"    
## [34] "BsmtFin_SF_1"       "BsmtFin_Type_2"     "BsmtFin_SF_2"      
## [37] "Bsmt_Unf_SF"        "Total_Bsmt_SF"      "Heating"           
## [40] "Heating_QC"         "Central_Air"        "Electrical"        
## [43] "First_Flr_SF"       "Second_Flr_SF"      "Low_Qual_Fin_SF"   
## [46] "Gr_Liv_Area"        "Bsmt_Full_Bath"     "Bsmt_Half_Bath"    
## [49] "Full_Bath"          "Half_Bath"          "Bedroom_AbvGr"     
## [52] "Kitchen_AbvGr"      "Kitchen_Qual"       "TotRms_AbvGrd"     
## [55] "Functional"         "Fireplaces"         "Fireplace_Qu"      
## [58] "Garage_Type"        "Garage_Finish"      "Garage_Cars"       
## [61] "Garage_Area"        "Garage_Qual"        "Garage_Cond"       
## [64] "Paved_Drive"        "Wood_Deck_SF"       "Open_Porch_SF"     
## [67] "Enclosed_Porch"     "Three_season_porch" "Screen_Porch"      
## [70] "Pool_Area"          "Pool_QC"            "Fence"             
## [73] "Misc_Feature"       "Misc_Val"           "Mo_Sold"           
## [76] "Year_Sold"          "Sale_Type"          "Sale_Condition"    
## [79] "Sale_Price"         "Longitude"          "Latitude"
dim(ames)
## [1] 2930   81
#Bringing Boston Housing Dataset
boston <- Boston
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
dim(boston)
## [1] 506  14

ML vs. DL Performance Analysis Workflow

DiagrammeR::grViz("               
digraph surveillance_diagram {    # 'digraph' means 'directional graph', then the graph name 

  # graph statement
  graph [layout = dot,
         rankdir = LR,            # layout left-to-right
         fontsize = 13]

  # nodes (circles)
  node [shape = circle,           # shape = circle
       fixedsize = true
       width = 2.1,
       height=1.5,
       fontsize=19,
       fontname = 'Helvetica-Bold',
       ]                      

  # individual component
  ames_data  [label = 'Ames\nhousing data', shape = cylinder, fontcolor='#3caea3', color='#f6d55c'] 
  divide_data [label = 'Divide Ames\ndata by types', shape = rect,fontcolor='#3caea3' color='#3caea3']
  numeric [label = 'Numeric \n variables', shape = rect,fontcolor='#3caea3' color='#3caea3']
  discrete [label = 'Discrete \n variables', shape = rect, fontcolor='#3caea3' color='#3caea3']
  ordinal [label = 'Ordinal \n variables', shape = rect,  fontcolor='#3caea3' color='#3caea3']
  categorical [label = 'Categorical \n variables', shape = rect,  fontcolor='#3caea3' color='#3caea3']
  reorder [label = 'Reorder\nordinal\nvariables', shape = rect,  fontcolor='#3caea3' color='#3caea3']
  one_hot [label = 'One-Hot\nEncoding', shape = rect,  fontcolor='#3caea3' color='#3caea3']
  training_set [label = '70%\n training set', shape = rect,fontcolor='#3caea3' color='#3caea3']
  testing_set  [label = '30% \ntesting\nvalidation set', shape = rect,  fontcolor='#3caea3' color='#3caea3']
  standardize [label = 'Standardize\nthe dataset', shape = rect,  fontcolor='#3caea3' color='#3caea3']
  eda [label = 'EDA', shape=oval, height=1.1, fontcolor='#20639b', color='#20639b']
  gis [label = 'GIS\nvisualization', shape=oval, fontcolor='#20639b', color='#20639b']
  machine_learning [label = 'Machine\nLearning\nmodels', shape=octagon, fontcolor = '#ed553b', color='#ed553b']
  deep_learning [label = 'Deep\nNeural\nNetwork', shape=doubleoctagon, fontcolor = '#ed553b', color='#ed553b']
  hyper_tuning [label = 'hyper-\nparameter\ntuning', shape=octagon, fontcolor = '#ed553b', color='#ed553b']
  performance [label = 'Performance\nComparison', fontcolor = '#ed553b', color='#ed553b']
  best_model [label = 'Select\nbest performing\nmodel',width=2.2, height=2.2 ,shape=tripleoctagon,fontcolor = '#173F5F', color='#173F5F']
  
  # Machine Learning Models
  mlr [label = 'Multiple\nLinear\nRegression', fontcolor = '#ed553b', color='#ed553b']
  regularization [label = 'Regulari-\nzation:\nLasso\nRidge\nENET', fontcolor = '#ed553b', color='#ed553b']
  tree_based [label = 'Tree-based:\nRandom Forest\nBagging', fontcolor = '#ed553b', color='#ed553b']
  dim_reduction [label = 'Dimension\nreduction:\nPCR\nPLSR',fontcolor = '#ed553b', color='#ed553b']
  
  #linking the components:
  ames_data -> divide_data -> {numeric discrete ordinal categorical}[style = dashed, color = '#3caea3']
  ordinal -> reorder[style = dashed, color = '#3caea3']
  categorical -> one_hot[style = dashed, color = '#3caea3']
  {one_hot reorder discrete numeric} -> training_set[style = dashed, color = '#3caea3']
  {one_hot reorder discrete numeric} -> testing_set[style = dashed, color = '#3caea3']
  {training_set testing_set} -> standardize[style = dashed, color = '#3caea3']
  standardize -> {eda gis}[style = dashed, color = '#20639b']
  {eda gis} -> machine_learning -> {mlr regularization tree_based dim_reduction}[style = dashed, color = '#ed553b']
  {eda gis} -> hyper_tuning -> deep_learning[style = dashed, color = '#ed553b']
  {mlr regularization tree_based dim_reduction deep_learning} -> performance -> best_model[style = dashed, color = '#ed553b']
  
 }")

Before developing statistical models, the original dataset needed to be pre-processed by employing the following steps:

  1. Extracting and separating variables from the dataset according to it types(numeric, discrete, categorical, ordinal)
  2. Re-ordering ordinal variables
  3. Pre-processing numeric variables
    • Partitioning numeric variables into training and testing Sets
    • Standardizing based on training data mean and standard deviation
  4. Pre-processing categorical variables(one-hot-vector encoding)
  5. Combining all variables for Deep Neural Network Training

Data Pre-processing

brief outlining of this section- simply state how things are organized

1. Extracting and separating variables from the dataset according to its types(numeric, discrete, categorical, ordinal, and response)

#Extracting Numeric Variables
ames_numeric <- ames %>% dplyr::select(Lot_Frontage, Lot_Area, Year_Built, 
                                       Year_Remod_Add, Mas_Vnr_Area, BsmtFin_SF_1,
                                       BsmtFin_SF_2, Bsmt_Unf_SF, Total_Bsmt_SF, 
                                       First_Flr_SF, Second_Flr_SF, Low_Qual_Fin_SF, 
                                       Gr_Liv_Area, Garage_Area, Wood_Deck_SF, 
                                       Open_Porch_SF, Enclosed_Porch, Three_season_porch, 
                                       Screen_Porch, Pool_Area, Misc_Val)

#Extracting Discrete Variables
ames_discrete <- ames %>% dplyr::select(Bsmt_Full_Bath, Bsmt_Half_Bath, Full_Bath, Half_Bath,
                                        Bedroom_AbvGr, Kitchen_AbvGr,
                                        TotRms_AbvGrd,  Fireplaces,Garage_Cars,
                                        Mo_Sold, Year_Sold)

#Extracting Categorical Variables
ames_categorical <- ames %>% dplyr::select(MS_SubClass, MS_Zoning, Street, 
                                           Alley,  Land_Contour,
                                           Lot_Config,                
                                           Neighborhood, Condition_1, Condition_2,
                                           Bldg_Type, House_Style, 
                                           Roof_Matl, Exterior_1st,
                                           Exterior_2nd, Mas_Vnr_Type, 
                                           Foundation,
                                           Heating, 
                                           Central_Air, 
                                           Garage_Type,   
                                           Misc_Feature, 
                                           Sale_Type, Sale_Condition)

#Extracting Ordinal Variables
ames_ordinal <- ames %>% dplyr::select(Lot_Shape, Land_Slope, 
                                       Overall_Qual, Overall_Cond, 
                                       Exter_Qual, Exter_Cond,
                                       Bsmt_Qual, Bsmt_Cond, Bsmt_Exposure, 
                                       BsmtFin_Type_1, BsmtFin_Type_2,
                                       Heating_QC, Electrical, Kitchen_Qual,
                                       Functional, Fireplace_Qu, Garage_Finish,
                                       Garage_Qual,Garage_Cond, 
                                       Paved_Drive, Pool_QC, Fence)

#Extracting Response Variable
ames_response <- ames %>% dplyr::select(Sale_Price)

2. Re-ordering ordinal variables

#Lot_Shape
Qual.levels <- c('Irregular', 'Moderately_Irregular', 'Slightly_Irregular','Regular')
ames_ordinal$Lot_Shape <- factor(ames_ordinal$Lot_Shape, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Lot_Shape <- as.integer(ames_ordinal$Lot_Shape)

#Land_Slope 
Qual.levels <- c('Gtl', 'Mod', 'Sev')
ames_ordinal$Land_Slope <- factor(ames_ordinal$Land_Slope, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Land_Slope <- as.integer(ames_ordinal$Land_Slope)

#Overall_Qual
Qual.levels <- c('Very_Poor', 'Poor', 'Fair', 'Below_Average', 'Average', 
                 'Above_Average', 'Good', 'Very_Good', 'Excellent', 'Very_Excellent')
ames_ordinal$Overall_Qual <- factor(ames_ordinal$Overall_Qual, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Overall_Qual <- as.integer(ames_ordinal$Overall_Qual)

#Overall_Cond
ames_ordinal$Overall_Cond <- factor(ames_ordinal$Overall_Cond, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Overall_Cond <- as.integer(ames_ordinal$Overall_Cond)

#Exter_Qual
Qual.levels <- c('Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Exter_Qual <- factor(ames_ordinal$Exter_Qual, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Exter_Qual <- as.integer(ames_ordinal$Exter_Qual)

#Exter_Cond
Qual.levels <- c('Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Exter_Cond <- factor(ames_ordinal$Exter_Cond, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Exter_Cond <- as.integer(ames_ordinal$Exter_Cond)

#Bsmt_Qual 
Qual.levels <- c('No_Basement','Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Bsmt_Qual <- factor(ames_ordinal$Bsmt_Qual, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Bsmt_Qual <- as.integer(ames_ordinal$Bsmt_Qual)

#Bsmt_Cond 
Qual.levels <- c('No_Basement','Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Bsmt_Cond <- factor(ames_ordinal$Bsmt_Cond, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Bsmt_Cond <- as.integer(ames_ordinal$Bsmt_Cond)

#Bsmt_Exposure 
Qual.levels <- c('No_Basement','No','Mn', 'Av', 'Gd')
ames_ordinal$Bsmt_Exposure <- factor(ames_ordinal$Bsmt_Exposure, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Bsmt_Exposure <- as.integer(ames_ordinal$Bsmt_Exposure)

#BsmtFin_Type_1
Qual.levels <- c('No_Basement','Unf','LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ')
ames_ordinal$BsmtFin_Type_1 <- factor(ames_ordinal$BsmtFin_Type_1, levels = Qual.levels, ordered = TRUE)
ames_ordinal$BsmtFin_Type_1 <- as.integer(ames_ordinal$BsmtFin_Type_1)

#BsmtFin_Type_2
Qual.levels <- c('No_Basement','Unf','LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ')
ames_ordinal$BsmtFin_Type_2 <- factor(ames_ordinal$BsmtFin_Type_2, levels = Qual.levels, ordered = TRUE)
ames_ordinal$BsmtFin_Type_2 <- as.integer(ames_ordinal$BsmtFin_Type_2)

#Heating_QC
Qual.levels <- c('Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Heating_QC <- factor(ames_ordinal$Heating_QC, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Heating_QC <- as.integer(ames_ordinal$Heating_QC)

#Electrical
Qual.levels <- c('Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr')
ames_ordinal$Electrical[ames$Electrical == 'Unknown'] <- 'SBrkr'
ames_ordinal$Electrical <- factor(ames_ordinal$Electrical, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Electrical <- as.integer(ames_ordinal$Electrical)

#Kitchen_Qual
Qual.levels <- c('Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Kitchen_Qual <- factor(ames_ordinal$Kitchen_Qual, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Kitchen_Qual <- as.integer(ames_ordinal$Kitchen_Qual)

#Functional
Qual.levels <- c('Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ')
ames_ordinal$Functional <- factor(ames_ordinal$Functional, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Functional <- as.integer(ames_ordinal$Functional)

#Fireplace_Qu 
Qual.levels <- c('No_Fireplace','Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Fireplace_Qu <- factor(ames_ordinal$Fireplace_Qu, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Fireplace_Qu <- as.integer(ames_ordinal$Fireplace_Qu)

#Garage_Finish
Qual.levels <- c('No_Garage','Unf','RFn', 'Fin')
ames_ordinal$Garage_Finish <- factor(ames_ordinal$Garage_Finish, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Garage_Finish <- as.integer(ames_ordinal$Garage_Finish)

#Garage_Qual
Qual.levels <- c('No_Garage','Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Garage_Qual <- factor(ames_ordinal$Garage_Qual, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Garage_Qual <- as.integer(ames_ordinal$Garage_Qual)

#Garage_Cond 
Qual.levels <- c('No_Garage','Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Garage_Cond <- factor(ames_ordinal$Garage_Cond, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Garage_Cond <- as.integer(ames_ordinal$Garage_Cond)

#Paved_Drive
Qual.levels <- c('Dirt_Gravel','Partial_Pavement','Paved')
ames_ordinal$Paved_Drive <- factor(ames_ordinal$Paved_Drive, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Paved_Drive <- as.integer(ames_ordinal$Paved_Drive)

#Pool_QC
Qual.levels <- c('No_Pool','Poor','Fair', 'Typical', 'Good', 'Excellent')
ames_ordinal$Pool_QC <- factor(ames_ordinal$Pool_QC, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Pool_QC <- as.integer(ames_ordinal$Pool_QC)

#Fence
Qual.levels <- c('No_Fence','Minimum_Wood_Wire','Good_Wood', 'Minimum_Privacy', 'Good_Privacy')
ames_ordinal$Fence <- factor(ames_ordinal$Fence, levels = Qual.levels, ordered = TRUE)
ames_ordinal$Fence <- as.integer(ames_ordinal$Fence)

glimpse(ames_ordinal)
## Rows: 2,930
## Columns: 22
## $ Lot_Shape      <int> 3, 4, 3, 4, 3, 3, 4, 3, 3, 4, 3, 3, 3, 4, 3, 2, 3, 4, 4…
## $ Land_Slope     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1…
## $ Overall_Qual   <int> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, 4…
## $ Overall_Cond   <int> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, 5…
## $ Exter_Qual     <int> 2, 2, 2, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 3, 4, 3, 3, 2…
## $ Exter_Cond     <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3…
## $ Bsmt_Qual      <int> 4, 4, 4, 4, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 6, 4…
## $ Bsmt_Cond      <int> 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Bsmt_Exposure  <int> 5, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 5, 4, 5, 4, 4, 2…
## $ BsmtFin_Type_1 <int> 5, 4, 6, 6, 7, 7, 7, 6, 7, 2, 2, 6, 2, 7, 7, 6, 7, 7, 3…
## $ BsmtFin_Type_2 <int> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2…
## $ Heating_QC     <int> 2, 3, 3, 5, 4, 5, 5, 5, 5, 4, 4, 5, 4, 4, 3, 5, 4, 5, 5…
## $ Electrical     <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5…
## $ Kitchen_Qual   <int> 3, 3, 4, 5, 3, 4, 4, 4, 4, 4, 3, 3, 3, 4, 4, 5, 3, 5, 3…
## $ Functional     <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8…
## $ Fireplace_Qu   <int> 5, 1, 1, 4, 4, 5, 1, 1, 4, 4, 4, 1, 5, 2, 1, 5, 1, 6, 1…
## $ Garage_Finish  <int> 4, 2, 2, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 2, 3, 4, 3, 4, 2…
## $ Garage_Qual    <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Garage_Cond    <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Paved_Drive    <int> 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
## $ Pool_QC        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Fence          <int> 1, 4, 1, 1, 4, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1…

3. Pre-processing numeric variables

  • Partitioning numeric variables into training and testing Sets
  • Standardizing based on training data mean and standard deviation
ames_numeric <- cbind(ames_ordinal, ames_numeric, ames_discrete)

n= nrow(ames_numeric)
train_index = sample(1:n, n*0.7, replace = FALSE)

ames_numeric_train <- ames_numeric[train_index, ]
ames_numeric_test <- ames_numeric[-train_index, ]

#calculating mean and standard deviation from training data only for no prior info leak before model training 
mean_train <- apply(ames_numeric_train, 2, mean) 
sd_train <- apply(ames_numeric_train, 2, sd) 

#scaling training and testing data
ames_numeric_train <- scale(ames_numeric_train, center=mean_train, scale=sd_train)
ames_numeric_test <- scale(ames_numeric_test, center=mean_train, scale=sd_train) #now numeric data are scaled properly

The table below displays the resulting mean and standard deviation of pre-processed training dataset. As can be seen, all the numeric variables has mean of 0 and standard deviation of 1.


The following table shows the mean and standard deviation for “testing” dataset. Unlike the training dataset, the means and standard deviations are not exactly 0 and 1, but fairly close. This, as mentioned above, was done on purpose to ensure no information leak from the training dataset which will be used for DNN training.

4. Pre-processing categorical variables (one-hot encoding)

Machine Learning algorithms cannot use categorical variables directly as input. Instead, all data must be presented in a numerical format that machines can properly understand and process them. One-hot encoding is one of the most commonly used methods to achieve this. The following code demonstrates how I applied this technique to convert all my categorical variables into binary vectors, and the accompanying results below show that all variables have been processed as intended.

dummy_formula <- dummyVars(~., data=ames_categorical)
ames_categorical_one_hot <- predict(dummy_formula, newdata=ames_categorical)
ames_categorical_train <- ames_categorical_one_hot[train_index,]
ames_categorical_test <- ames_categorical_one_hot[-train_index,]
## Rows: 2,051
## Columns: 5
## $ MS_SubClass.One_Story_1946_and_Newer_All_Styles    <dbl> 1, 0, 0, 0, 0, 0, 0…
## $ MS_SubClass.One_Story_1945_and_Older               <dbl> 0, 0, 0, 0, 0, 0, 0…
## $ MS_SubClass.One_Story_with_Finished_Attic_All_Ages <dbl> 0, 0, 0, 0, 0, 0, 0…
## $ MS_SubClass.One_and_Half_Story_Unfinished_All_Ages <dbl> 0, 0, 0, 0, 0, 0, 0…
## $ MS_SubClass.One_and_Half_Story_Finished_All_Ages   <dbl> 0, 0, 0, 0, 0, 1, 0…

5. Combining all variables for Deep Neural Network Training

Now that all the variables in the Ames housing dataset has been properly pre-processed. As a final step, I am going to organize the exploratory and response variables into separate training and testing datasets. The results from the ‘dim()’ function demonstrates that we are using 70% of data as training set, while the remaining 30% is reserved for testing and validation. We are now ready to move on to the more exciting parts of our analysis: Exploratory Data Analysis(EDA) and actual machine learning!

ames_train_targets <- ames$Sale_Price[train_index] 
ames_test_targets <- ames$Sale_Price[-train_index]
ames_DN_train <- cbind(ames_numeric_train, ames_categorical_train) 
ames_DN_test <- cbind(ames_numeric_test, ames_categorical_test)
dim(as.data.frame(ames_DN_train))
## [1] 2051  239
dim(as.data.frame(ames_DN_test))
## [1] 879 239

Exploratory Data Analysis

Explanation about EDA

1. Visually inspecting Ames Neighborhoods

#Brining the shapefile for visualization 
ames_road <- st_read('../tl_2023_19169_roads', quiet=TRUE, layer="tl_2023_19169_roads")

#Utilizing color brewer palette
nb.cols <- 29 
myColors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
names(myColors) <- levels(ames$Neighborhood)
custom_colors <- scale_colour_manual(name = "Neighborhood", values = myColors)

#Settting the limit for lon, lat
xmin <- -93.70
xmax <- -93.60
ymin <- 41.99
ymax <- 42.06

#preparing a backgeround map
background <- ggplot(ames_road) + geom_sf(color="grey") +
    xlim(xmin, xmax) + ylim(ymin, ymax)

#Visualizing the Ames neighborhoods
background + geom_point(data=ames,mapping = aes(x=Longitude, y=Latitude, colour = Neighborhood), size=1.4) + custom_colors 

2. Examining the Response Variable

hist_saleprice <- 
  ggplot(ames, aes(x=Sale_Price)) + geom_histogram(bins=50, col="white") 

ames$Price_Category <- cut(ames$Sale_Price, 
                           breaks = c(-Inf, 100000, 200000, 300000, 400000, Inf),
                           label = c("Very Low", "Low", "Medium", "High", "Very High"))

custom_palette <- c("Very Low"="blue", "Low"="lightblue", "Medium"="yellow", "High"="orange", "Very High"="red")

map_saleprice<-
  background + 
  geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Price_Category), size=1.4) + 
  scale_color_manual(values = custom_palette) +
  guides(colour=guide_legend(nrow=2)) +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        legend.text = element_text(size = 9))

hist_saleprice + map_saleprice + plot_layout(ncol = 2)

2. Lot Size vs. Sale_price

hist_lot <- ggplot(ames, aes(x=Lot_Area)) + 
  geom_histogram(bins=50, col="white") + 
  xlim(0, 50000) #Lot_Area (Continuous): Lot size in square feet

lot_price_scatter <-
  ggplot(ames, aes(x=Lot_Area, y=Sale_Price)) + geom_point() + 
  xlim(0, 30000) +
  geom_smooth(se=TRUE)

hist_lot + lot_price_scatter + plot_layout(ncol = 2)

3. Gr_Liv_Area vs. Sale_price

#scatter plot between Gr_Liv_Area and Sale_Price
Gr_Liv_SalePrice_scatter <- ggplot(ames, aes(x=Gr_Liv_Area, y=Sale_Price)) + geom_point() + geom_smooth(se=TRUE) 

#defining class breaks based on Gr_Liv_Area
ames$Gr_Liv_Area_category <- cut(ames$Gr_Liv_Area, breaks = c(0, 500, 1500, 2000, 2500, 5642),
                                 labels = c("< 500", "500-1499", "1500-1999", "2000-2499", "2500+"),
                                 include.lowest = TRUE,
                                 right=FALSE)

#defining a custom palette 
custom_palette <- c("< 500" = "blue", "500-1499" = "lightblue", "1500-1999" = "yellow", "2000-2499" = "orange", "2500+" = "red")

#Creatring a map showing the spatial distribution of Gr_Liv_Area
map_Gr_Liv_Area<-
  background + 
  geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Gr_Liv_Area_category), size=1.4) + 
  scale_color_manual(values = custom_palette) +
  guides(colour=guide_legend(nrow=2)) +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        legend.text = element_text(size = 9))

Gr_Liv_SalePrice_scatter + map_Gr_Liv_Area + plot_layout(ncol = 2)

4. Year_Built vs. Sale_price

#histogram for Year_Built 
hist_year_built  <- ggplot(ames, aes(x=Year_Built)) + geom_histogram(bins=45, col="white")

#scatter plot for Year_Built vs. Sale_Price
year_built_saleprice_scatter <- ggplot(ames, aes(x=Year_Built, y=Sale_Price)) + geom_point() + geom_smooth(se=TRUE) 

#Defining class breaks for a Year_Built variable 
ames$year_built_category <- cut(ames$Year_Built, breaks = c(-Inf, 1900, 1930, 1960, 1990, 2000, Inf),
                           label = c("before_1900", "00_30", "30_60", "60_90", "90_20", "after_20"))

#defining a custom palette
custom_palette <- c("before_1900"="#ffffcc", "00_30"="#c7e9b4", "30_60"="#7fcdbb", "60_90"="#41b6c4", "90_20"="#2c7fb8", "after_20"="#253494")

#Spatial Distribution of Year_Built
map_year_built <-
  background + 
  geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = year_built_category), size=1.4) + 
  scale_color_manual(values = custom_palette)

5. Overall_Quality vs. Sale_price

boxplot_condition_price <- ggplot(ames, aes(x=Overall_Qual, y=Sale_Price)) + geom_boxplot(aes(colour=Overall_Qual), show.legend = FALSE) 

map_condition <- 
background + 
  geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Overall_Qual), size=0.8) +
  scale_color_brewer(palette = "Spectral") 
  

boxplot_condition_price + map_condition + plot_layout(ncol=2)

6. External Condition vs. Sale_price

boxplot_external_qual_price <- 
  ggplot(ames, aes(x=Exter_Qual, y=Sale_Price)) + 
  geom_boxplot(aes(colour=Exter_Qual), show.legend = FALSE)


map_external_qual <- 
  background + geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Exter_Qual), size=0.8) +
  scale_color_brewer(palette = "Spectral")+
  guides(color = guide_legend(title = NULL))

boxplot_external_qual_price + map_external_qual + plot_layout(ncol=2)

Performance Comparison of Machine Learning(ML) Models for Housing Price Prediction

Explanation about Performance Comparison of Machine Learning(ML) Models for Housing Price Prediction

n = nrow(ames)
rep = 15

#creating empty vector to save training & testing error for each model
mse.train.model3 = mse.train.lasso = mse.train.ridge = 
mse.train.enet = mse.train.rf= mse.train.bag =
mse.train.pcr = mse.train.plsr = rep(0,15) 

mse.test.model3 = mse.test.lasso = mse.test.ridge = 
mse.test.enet = mse.test.rf = mse.test.bag =
mse.test.pcr = mse.test.plsr = rep(0,15) 

#loop
set.seed(1234)

for(i in 1:rep){
  cat("Processing loop #", i, "\n")
  train_index = sample(1:n, n*0.7, replace = FALSE)
  ames_train = ames[train_index, ]
  ames_test = ames[-train_index, ]
  X_train = model.matrix(Sale_Price ~ ., ames_train)[,-1]
  Y_train = ames_train$Sale_Price
  X_test = model.matrix(Sale_Price ~., ames_test)[,-1]
  Y_test = ames_test$Sale_Price
  
  #model3 - linear regression model using all variables
  model3 <- glmnet(x=X_train, y=Y_train, lambda = 0, alpha=0) 
  pred.train.model3 <- predict(model3, X_train)
  pred.test.model3 <- predict(model3, X_test) 
  mse.train.model3[i] <- MAE(pred.train.model3, ames_train$Sale_Price)
  mse.test.model3[i] <- MAE(pred.test.model3, ames_test$Sale_Price)
  
  #lasso
  cvfitl = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=1,standardize=TRUE)
  pred.train.l = predict(cvfitl, newx = X_train, s = "lambda.min") 
  pred.test.l = predict(cvfitl, newx = X_test, s = "lambda.min") 
  mse.train.lasso[i] = MAE(pred.train.l, Y_train)
  mse.test.lasso[i] = MAE(pred.test.l, Y_test)
  
  #ridge
  cvfitr = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0,standardize=TRUE)
  pred.train.r = predict(cvfitr, newx = X_train, s = "lambda.min") 
  pred.test.r = predict(cvfitr, newx = X_test, s = "lambda.min") 
  mse.train.ridge[i] = MAE(pred.train.r, Y_train)
  mse.test.ridge[i] = MAE(pred.test.r, Y_test)
  
  #ENET
  cvfiten = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0.5,standardize=TRUE)
  pred.train.en = predict(cvfiten, newx = X_train, s = "lambda.min") 
  pred.test.en = predict(cvfiten, newx = X_test, s = "lambda.min") 
  mse.train.enet[i] = MAE(pred.train.en, Y_train)
  mse.test.enet[i] = MAE(pred.test.en, Y_test)
  
  #PCR
  numeric_cols = sapply(ames, is.numeric)
  ames_numeric = ames[,numeric_cols]
  ames_train_pcr = ames_numeric[train_index,]
  ames_test_pcr = ames_numeric[-train_index,]
  pcr <- pcr(Sale_Price ~., data=ames_train_pcr, scale=TRUE, ncomp=30) #ncomp has been selected through cross-validation <- need to do it again
  pcr.predict.train = predict(pcr, ncomp=30)
  pcr.predict.test = predict(pcr, newdata=ames_test_pcr, ncomp=30)
  mse.train.pcr[i] = MAE(pcr.predict.train, ames_train_pcr$Sale_Price)
  mse.test.pcr[i] = MAE(pcr.predict.test, ames_test_pcr$Sale_Price)
  
  #PLSR
  plsr <- pls::plsr(Sale_Price ~., data=ames_train_pcr, scale=TRUE, ncomp=6) #ncomp has been selected through cross-validation
  plsr.predict.train = predict(plsr, ncomp=6)
  plsr.predict.test = predict(plsr, newdata=ames_test_pcr, ncomp=6)
  mse.train.plsr[i] = MAE(plsr.predict.train, ames_train_pcr$Sale_Price)
  mse.test.plsr[i] = MAE(plsr.predict.test, ames_test_pcr$Sale_Price)
  
  #Random Forest
  rf <- randomForest(Sale_Price~., data=ames_train, mtry=8)
  rf.predict.train = predict(rf)
  rf.predict.test =predict(rf, newdata = ames_test[,!names(ames_test)%in%c("Sale_Price")])
  mse.train.rf[i] = MAE(rf.predict.train, ames_train$Sale_Price)
  mse.test.rf[i] = MAE(rf.predict.test, ames_test$Sale_Price)

  #bagging
  bag <- randomForest(Sale_Price~., data=ames_train, mtry=57)
  bag.predict.train = predict(bag)
  bag.predict.test =predict(bag, newdata = ames_test[,!names(ames_test)%in%c("Sale_Price")])
  mse.train.bag[i] = MAE(bag.predict.train, ames_train$Sale_Price)
  mse.test.bag[i] = MAE(bag.predict.test, ames_test$Sale_Price)
  
}

### Boxplot for mse test
mse.test.combined <- c(mse.test.model3, mse.test.lasso, mse.test.ridge, 
                       mse.test.enet, mse.test.pcr, mse.test.plsr, 
                       mse.test.rf,mse.test.bag)

index <- rep(c("test.model3", "test.lasso", "test.ridge", 
               "test.enet", "test.pcr", "test.plsr", 
               "test.rf","test.bag"), each=15)

mse.test.combined <- cbind(mse.test.combined, index)
mse.test.combined <- as.data.frame(mse.test.combined)
mse.test.combined$mse.test.combined <- as.numeric(mse.test.combined$mse.test.combined) 

ggplot(data=mse.test.combined, aes(x=index, y=mse.test.combined)) + geom_boxplot()

### Boxplot for mse training
mse.train.combined <- c(mse.train.model3, mse.train.lasso, mse.train.ridge, 
                        mse.train.enet, mse.train.pcr, mse.train.plsr, 
                        mse.train.rf, mse.train.bag)

index <- rep(c("train.model3", "train.lasso", "train.ridge", 
               "train.enet", "train.pcr","train.plsr", 
               "train.rf", "train.bag"), each=15)

mse.train.combined <- cbind(mse.train.combined, index)
mse.train.combined <- as.data.frame(mse.train.combined)
mse.train.combined$mse.train.combined <- as.numeric(mse.train.combined$mse.train.combined) 

ggplot(data=mse.train.combined, aes(x=index, y=mse.train.combined)) + geom_boxplot()    

Deep Neural Network(DNN) for Housing Price Prediction

Explanation about Deep Neural Network(DNN) for Housing Price Prediction

## Initial Deep Learning Model
## Initial model looks like the following 
build_model <- function(){ #because we need to instantiate the same model multiple times, we use function to construct it.
  
  model <- keras_model_sequential() %>%
    layer_dense(64, activation = "relu", input_shape = ncol(ames_DN_train)) %>%
    layer_dense(64, activation = "relu") %>%
    layer_dense(1)
  
  model %>% compile(optimizer = optimizer_rmsprop(), 
                    loss = "mse",
                    metrics = "mae")
  
  model
}

## I went through model fine tuning process, 1) first by manually tweaking hyperparameters 2) second by utilizing grid search
FLAGS <- flags(
  # Nodes
  flag_numeric("nodes1", 256),
  flag_numeric("nodes2", 128),
  # Dropout
  flag_numeric("dropout1", 0.4),
  flag_numeric("dropout2", 0.3),
  #learning parameters
  flag_string("optimizer", "rmsprop"),
  # batch_size
  flag_numeric("batch_size", 64)
)

model <- keras_model_sequential() %>%
  layer_dense(units = FLAGS$nodes1, activation = "relu", input_shape = ncol(ames_DN_train)) %>%
  layer_dropout(rate = FLAGS$dropout1) %>%
  layer_dense(units = FLAGS$nodes2, activation = "relu") %>%
  layer_dropout(rate = FLAGS$dropout2) %>%
  layer_dense(units = 1) %>%
  compile(
    loss = "mse",
    metrics = "mae",
    optimizer = FLAG$optimizer
  ) %>%
  fit(
    x = ames_DN_train,
    y = ames_train_targets,
    epochs = 500,
    batch_size = FLAGS$batch_size,
    validation_split = 0.2,
    callbacks = list(
      callback_early_stopping(patience = 25)
    ),
    verbose = 0
  )

runs <- tuning_run(file = "C:/coding/R/scripts/ames_grid_search.R", 
                   flags = list(
                     nodes1 = c(64, 128, 256),
                     nodes2 = c(64, 128, 256),
                     dropout1 = c(0.2, 0.3, 0.4),
                     dropout2 = c(0.2, 0.3, 0.4),
                     optimizer = c("rmsprop", "adam"),
                     batch_size = c(16,32,64,128)
                    ),
                   sample=0.7
                   )

# Rows: 1
# Columns: 28
# $ X                <int> 129
# $ run_dir          <chr> "runs/2024-06-04T05-23-16Z"
# $ metric_loss      <int> 653320960
# $ metric_mae       <dbl> 17894.76
# $ metric_val_loss  <int> 613711040
# $ metric_val_mae   <dbl> 14674.82
# $ flag_nodes1      <int> 128
# $ flag_nodes2      <int> 128
# $ flag_dropout1    <dbl> 0.4
# $ flag_dropout2    <dbl> 0.2
# $ flag_batch_size  <int> 16
# $ epochs           <int> 500
# $ epochs_completed <int> 113
# $ metrics          <chr> "runs/2024-06-04T05-23-16Z/tfruns.d/metrics.json"
# $ model            <chr> "Model: \"sequential\"\n________________________________________________________________________________\n Layer (type)        ???
# $ loss_function    <chr> "mse"
# $ optimizer        <chr> "<keras.src.optimizers.rmsprop.RMSprop object at 0x000002A99E9CFBD0>"
# $ learning_rate    <dbl> 0.01
# $ script           <chr> "ames_grid_search.R"
# $ start            <chr> "2024-06-04 05:23:17.35663"
# $ end              <chr> "2024-06-04 05:23:40.85267"
# $ completed        <lgl> TRUE
# $ output           <chr> "\n> #source : https://bradleyboehmke.github.io/HOML/deep-learning.html\n> \n> # FLAGS <- flags(\n> #   # Nodes\n> #   flag_num???
#   $ source_code      <chr> "runs/2024-06-04T05-23-16Z/tfruns.d/source.tar.gz"
# $ context          <chr> "local"
# $ type             <chr> "training"



## Building Deep Learning model; after hyperparameter fine tuning, I found the following best performing DN Model
## As can be seen, I, aided by grid search above, have added l2_regularization, layer_dropout, and adjusted unit number for each layer.
## lastly, I used adam optimizer instead of rmsprop. <-- in the portfolio, I need to briefly explain how each parameter helps. 

build_model <- function(){ #because we need to instantiate the same model multiple times, we use function to construct it.
  
  model <- keras_model_sequential() %>%
    layer_dense(128, activation = "relu", kernel_regularizer = regularizer_l2(0.001),input_shape = ncol(ames_DN_train)) %>%
    layer_dropout(rate=0.4) %>%
    layer_dense(128, activation = "relu", kernel_regularizer = regularizer_l2(0.001)) %>%
    layer_dropout(rate=0.2) %>%
    layer_dense(1)
  
  model %>% compile(optimizer = optimizer_adam(), 
                    loss = "mse",
                    metrics = "mae")
  
  model

 }

#Training the final deep neural network model for 500 epochs using k-fold cross validation
k <- 5
fold_id <- sample(rep(1:k, length.out = nrow(ames_DN_train)))
num_epochs <- 2000
all_mae_histories <- list()

for(i in 1:k) {
  cat("Processing fold #", i, "\n")
  val_indices <- which(fold_id == i)
  val_data <- ames_DN_train[val_indices, ] #prepare the validation data 
  val_targets <- ames_train_targets[val_indices]
  partial_train_data <- ames_DN_train[-val_indices,]
  partial_train_targets <- ames_train_targets[-val_indices]
  
  model <- build_model()
  
  history <- model %>% fit(
    partial_train_data,
    partial_train_targets,
    validation_data = list(val_data, val_targets),
    epochs = num_epochs,
    batch_size = 32,
    # callbacks = list(
    #   #callback_early_stopping(patience = 5),
    #   callback_reduce_lr_on_plateau(factor = 0.05)
    # ),
    verbose = 0
  )
  
  mae_history <- history$metrics$val_mae
  
  all_mae_histories[[i]] <- mae_history
  
}

all_mae_histories <- do.call(cbind, all_mae_histories)
plot(history)
average_mae_history <- rowMeans(all_mae_histories) #calculating average per epoch
plot(average_mae_history, xlab="epoch",  type = 'l')


truncated_mae_history <- average_mae_history[-(1:200)]
 plot(average_mae_history, xlab="epoch",  type = 'l', ylim = range(truncated_mae_history))

min <-which.min(average_mae_history)
#[1] 936
average_mae_history[min] #901 = 13907.79

### Best model chosen by grid search
#min<- 950
model <- build_model()

history <- model %>%
  fit(ames_DN_train, ames_train_targets, epoch = min*1.2, batch_size = 32) 
#validation_split = 0.2)

result <- model %>% evaluate(ames_DN_test, ames_test_targets)

result["mae"] 


ames_dn_test_results <- rep(0, 15)

for(i in 1:15) {
  cat("Processing Loop #", i, "\n")
  model <- build_model()
  model %>% fit(ames_DN_train, ames_train_targets, #train it on the entirety of the data
                epochs = min*1.2, batch_size = 32, verbose = 0)
  result <- model %>% evaluate(ames_DN_test, ames_test_targets)
  ames_dn_test_results[i] <- result["mae"] 
}

#result from baseline model #this looks better than the model below hmm...
ames_dn_test_results <- c(15005.39, 14925.67, 14870.62, 
                          14981.25, 14965.23, 14908.94,
                          15131.90, 14907.56, 15028.89,
                          15025.54, 14952.75, 14925.85,
                          14945.20, 15219.53, 15198.08)

Performance Comparison between Machine Learning and Deep Neural Network

Explanation about Performance Comparison between Machine Learning and Deep Neural Network

# ensemble of DN and RF? ###
mse.test.rf
ames_dn_test_results

mse.ensemble <- 0.8*ames_dn_test_results + 0.2*mse.test.rf

### After Deep Learning Include the following for the final model comparison
mse.test.combined <- c(mse.test.model3, mse.test.lasso, mse.test.ridge, 
                       mse.test.enet, mse.test.pcr, mse.test.plsr, 
                       mse.test.rf,ames_dn_test_results, mse.ensemble) #mse.test.bag,

index <- rep(c("test.model3", "test.lasso", "test.ridge", 
               "test.enet", "test.pcr", "test.plsr",   #"test.bag",
               "test.rf","test.DN", "test.ensemble"), 
                each=15)

mse.test.combined <- cbind(mse.test.combined, index)
mse.test.combined <- as.data.frame(mse.test.combined)
mse.test.combined$mse.test.combined <- as.numeric(mse.test.combined$mse.test.combined) 

ggplot(data=mse.test.combined, aes(x=index, y=mse.test.combined)) + geom_boxplot()

Implementing the Same ML Methods on Boston Housing Dataset

Explanation about Implementing the Same Methods on Boston Housing Dataset

boston <- Boston
str(Boston)
dim(boston)
n = nrow(boston)
rep = 20


mse.train.model1 = mse.train.model2 = mse.train.model3 = 
  mse.train.lasso = mse.train.ridge = mse.train.enet = rep(0,20) #creating empty vector to save training error for each model

mse.test.model1 = mse.test.model2 = mse.test.model3 = 
  mse.test.lasso = mse.test.ridge = mse.test.enet = rep(0,20) #creating empty vector to save training error for each model

#loop
set.seed(1234)

for(i in 1:rep){
  cat("Processing Loop #", i, "\n\n")
  train_index = sample(1:n, n*0.7, replace = FALSE)
  boston_train = boston[train_index, ]
  boston_test = boston[-train_index, ]
  X_train = model.matrix(medv ~ ., boston_train)[,-1]
  Y_train = boston_train$medv
  X_test = model.matrix(medv ~., boston_test)[,-1]
  Y_test = boston_test$medv
  
  #model1
  model1 = lm(medv ~ rm, data = boston_train)
  pred.train.model1 = predict(model1)
  pred.test.model1 = predict(model1, newdata=boston_test[,!names(boston_test)%in%c("medv")])
  mse.train.model1[i] = MAE(pred.train.model1, boston_train$medv) 
  #sqrt(sum((boston_train$medv - pred.train.model1)^2)/length(boston_train$medv))
  mse.test.model1[i] = MAE(pred.test.model1, boston_test$medv)
  #sqrt(sum((boston_test$medv - pred.test.model1)^2)/length(boston_test$medv))
  
  #model2
  model2 = lm(medv ~ rm + tax + ptratio + black + chas, data = boston_train)
  pred.train.model2 = predict(model2)
  pred.test.model2 = predict(model2, newdata=boston_test[,!names(boston_test)%in%c("medv")])
  mse.train.model2[i] = MAE(pred.train.model2, boston_train$medv)
  #sqrt(sum((boston_train$medv - pred.train.model2)^2)/length(boston_train$medv))
  mse.test.model2[i] = MAE(pred.test.model2, boston_test$medv)
  #sqrt(sum((boston_test$medv - pred.test.model2)^2)/length(boston_test$medv))
  
  #model3
  model3 <- glmnet(x=X_train, y=Y_train, lambda = 0, alpha=0)
  pred.train.model3 <- predict(model3, X_train)
  pred.test.model3 <- predict(model3, X_test) 
  mse.train.model3[i] <- MAE(pred.train.model3, boston_train$medv)
  mse.test.model3[i] <- MAE(pred.test.model3, boston_test$medv)
  
  #lasso
  cvfitl = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=1,standardize=TRUE)
  pred.train.l = predict(cvfitl, newx = X_train, s = "lambda.min") 
  pred.test.l = predict(cvfitl, newx = X_test, s = "lambda.min") 
  mse.train.lasso[i] = MAE(pred.train.l, Y_train)
  mse.test.lasso[i] = MAE(pred.test.l, Y_test)
  
  #ridge
  cvfitr = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0,standardize=TRUE)
  pred.train.r = predict(cvfitr, newx = X_train, s = "lambda.min") 
  pred.test.r = predict(cvfitr, newx = X_test, s = "lambda.min") 
  mse.train.ridge[i] = MAE(pred.train.r, Y_train)
  mse.test.ridge[i] = MAE(pred.test.r, Y_test)
  
  #ENET
  cvfiten = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0.5,standardize=TRUE)
  pred.train.en = predict(cvfiten, newx = X_train, s = "lambda.min") 
  pred.test.en = predict(cvfiten, newx = X_test, s = "lambda.min") 
  mse.train.enet[i] = MAE(pred.train.en, Y_train)
  mse.test.enet[i] = MAE(pred.test.en, Y_test)
  
}

### Boxplot for mse test
mse.test.combined <- c(mse.test.model1, mse.test.model2, mse.test.model3, mse.test.lasso, mse.test.ridge, mse.test.enet)
index <- rep(c("test.model1", "test.model2", "test.model3", "test.lasso", "test.ridge", "test.enet"), each=20)
mse.test.combined <- cbind(mse.test.combined, index)
mse.test.combined <- as.data.frame(mse.test.combined)
mse.test.combined$mse.test.combined <- as.numeric(mse.test.combined$mse.test.combined) 

ggplot(data=mse.test.combined, aes(x=index, y=mse.test.combined)) + geom_boxplot()

### Boxplot for mse training
mse.train.combined <- c(mse.train.model1, mse.train.model2, mse.train.model3, mse.train.lasso, mse.train.ridge, mse.train.enet)
index <- rep(c("train.model1", "train.model2", "train.model3", "train.lasso", "train.ridge", "train.enet"), each=20)
mse.train.combined <- cbind(mse.train.combined, index)
mse.train.combined <- as.data.frame(mse.train.combined)
mse.train.combined$mse.train.combined <- as.numeric(mse.train.combined$mse.train.combined) 

ggplot(data=mse.train.combined, aes(x=index, y=mse.train.combined)) + geom_boxplot()

Implementing the Same DNN Method on Boston Housing Dataset

Explanation about Deep Neural Network

#Deep Learning...with Boston data first ->, then move on to using AMES dataset. This may more complicated process
#because AMES dataset contains a lot of categorical variable. I need to make sure how I should preprocess all those variables. 
boston <- Boston
names(boston)
predictors <- as.matrix(boston[,as.numeric(1:13)])
targets <- as.matrix(boston[ ,14])
n <- nrow(boston)
set.seed(1234)

train_index <- sample(1:n, 404, replace=FALSE)
train_data <- predictors[train_index,]
train_targets <- targets[train_index]
test_data <- predictors[-train_index,] 
test_targets <- targets[-train_index]

mean <- apply(train_data, 2, mean)
sd <- apply(train_data, 2, sd)
train_data <- scale(train_data, center = mean, scale = sd)
test_data <- scale(test_data, center =mean, scale = sd)


build_model <- function(){ #because we need to instantiate the same model multiple times, we use function to construct it.
  model <- keras_model_sequential() %>%
    layer_dense(64, activation = "relu") %>%
    #layer_dropout(rate = 0.2) %>%
    #layer_batch_normalization() %>%
    layer_dense(64, activation = "relu") %>%
    #layer_dropout(rate = 0.2) %>%
    #layer_batch_normalization() %>%
    layer_dense(1)
  
  #, kernel_regularizer = regularizer_l2(0.001)
  model %>% compile(optimizer = "rmsprop",
                    loss = "mse",
                    metrics = "mae")
  
  model
  
}


#Let's try training the model a bit longer: 500 epochs
#k-fold validation
k <- 4
fold_id <- sample(rep(1:k, length.out = nrow(train_data)))
num_epochs <- 500
all_mae_histories <- list()

for(i in 1:k) {
  cat("Processing fold #", i, "\n")
  val_indices <- which(fold_id == i)
  val_data <- train_data[val_indices, ] #prepare the validation data 
  val_targets <- train_targets[val_indices]
  partial_train_data <- train_data[-val_indices,]
  partial_train_targets <- train_targets[-val_indices]
  
  model <- build_model()
  
  history <- model %>% fit(
    partial_train_data,
    partial_train_targets,
    validation_data = list(val_data, val_targets),
    epochs = num_epochs,
    batch_size = 16,
    verbose = 0
  )
  
  mae_history <- history$metrics$val_mae
  all_mae_histories[[i]] <- mae_history
}

all_mae_histories <- do.call(cbind, all_mae_histories)
plot(history)
average_mae_history <- rowMeans(all_mae_histories) #calculating average per epoch
plot(average_mae_history, xlab="epoch",  type = 'l')

truncated_mae_history <- average_mae_history[-(1:20)]

plot(average_mae_history, xlab="epoch",  type = 'l',
     ylim = range(truncated_mae_history))

min = which.min(average_mae_history)
#training the final model
model <- build_model() 

history <- model %>% fit(train_data, train_targets, #train it on the entirety of the data
                         epochs = min, batch_size = 16)

result <- model %>% evaluate(test_data, test_targets)
result["mae"]

DN.test.mae <- rep(0,20)


for(i in 1:20){
  cat("Loop #", i, "\n")
  model <- build_model() 
  model %>% fit(train_data, train_targets, #train it on the entirety of the data
                epochs = min*1.2, batch_size = 16, verbose = 0)
  
  result <- model %>% evaluate(test_data, test_targets)
  
  DN.test.mae[i] <- result["mae"]
  
}


DN.test.mae
mean(DN.test.mae)
### Boxplot for mse test
mse.test.combined <- c(mse.test.model1, mse.test.model2, mse.test.model3, 
                       mse.test.lasso, mse.test.ridge, mse.test.enet, DN.test.mae)

index <- rep(c("test.model1", "test.model2", "test.model3", "test.lasso", "test.ridge", "test.enet", "test.DN"), each=20)

mse.test.combined <- cbind(mse.test.combined, index)
mse.test.combined <- as.data.frame(mse.test.combined)
mse.test.combined$mse.test.combined <- as.numeric(mse.test.combined$mse.test.combined) 

ggplot(data=mse.test.combined, aes(x=index, y=mse.test.combined)) + geom_boxplot()

Project Summary and Discussion

Project Summary and Discussion


#change